**********************************************************************************************************************************************************
*This do-file codes the GSO data and creates various IVs.
*	a.	Input files:
*		i.	GSO_CLEAN.dta
*		ii.	crosswalk_province.csv
*		iii.	region.dta
*	b.	Output files:
*		i.	GSO_IV.dta
*		ii.	GSO_IVnoncollapsed.dta
*		iii.	GSO_IVagg.dta
*		iv.	GSO_IVregion.dta
***********************************************************************************************************************************************************

* Start from an empty session and switch off output paging.
clear all
set more off

* Memory and variable-limit settings (order kept as in the original script).
set mem 600m
set maxvar 3000

*SET DIRECTORY HERE*
* Each global holds one folder path; file names are appended later as "${global}/file".
global dir_data_original "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/original raw data/"
global dir_data_coded "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/intermediary data/"
global dir_data_analysis "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/analysis data/"

* NOTE(review): -version- is called without a release number, so it appears to only
* report the active version instead of pinning one; confirm which release was intended.
version

******************************************************************************************************************************************************
* Load GSO_CLEAN.dta from the intermediary-data folder.
use "${dir_data_coded}/GSO_CLEAN.dta", clear

**KEEP LARGE PRIVATE FIRMS
* Keep observations with employ above 10 and lhdn between 7 and 11 (inclusive).
* missing() rejects every missing code (., .a-.z). The former test employ!=. only
* excluded the system missing value, so extended missing values, which compare
* greater than 10 and are not equal to ".", slipped through the filter.
keep if employ>10 & !missing(employ) & inrange(lhdn,7,11)

**GENERATE LHS VAR AND IV
* Suffixes used below: _jt = within a jt cell, _rjt = within an rjt cell,
* _r_jt = the jt cell minus the own rjt cell, _rt = within an rt cell.

* Number of observations per cell, and the jt count net of the own rjt cell.
bysort jt: generate numfirms_jt = _N
bysort rjt: generate numfirms_rjt = _N
generate numfirms_r_jt = numfirms_jt - numfirms_rjt

* jt-level total and mean of employ, with natural logs.
bysort jt: egen totalemploy_jt = total(employ)
generate lntotalemploy_jt = ln(totalemploy_jt)
bysort jt: egen meanemploy_jt = mean(employ)
generate lnmeanemploy_jt = ln(meanemploy_jt)

* rjt-level total, mean, median and 25th/75th percentiles of employ, with natural logs.
bysort rjt: egen totalemploy_rjt = total(employ)

generate lntotalemploy_rjt = ln(totalemploy_rjt)

bysort rjt: egen meanemploy_rjt = mean(employ)
generate lnmeanemploy_rjt = ln(meanemploy_rjt)

bysort rjt: egen medemploy_rjt = median(employ)
generate lnmedemploy_rjt = ln(medemploy_rjt)

bysort rjt: egen p25employ_rjt = pctile(employ), p(25)
generate lnp25employ_rjt = ln(p25employ_rjt)

bysort rjt: egen p75employ_rjt = pctile(employ), p(75)
generate lnp75employ_rjt = ln(p75employ_rjt)

* Leave-out measures: jt totals with the own rjt cell removed.
generate totalemploy_r_jt = totalemploy_jt - totalemploy_rjt

generate lntotalemploy_r_jt = ln(totalemploy_r_jt)

* Leave-out mean = leave-out total divided by leave-out count
* (missing when no other observation shares the jt cell).
generate meanemploy_r_jt = totalemploy_r_jt / numfirms_r_jt
generate lnmeanemploy_r_jt = ln(meanemploy_r_jt)

* rt-level total of employ.
bysort rt: egen totalemploy_rt = total(employ)

* Remove broad categories O, T and U. This happens after the cell totals were
* computed, so those totals still include observations from these categories.
drop if inlist(broad, "O", "T", "U")

* Store the observation-level file.
save "${dir_data_analysis}/GSO_IVnoncollapsed.dta", replace

* Reduce the data to the first observation of every rjt cell.
bysort rjt: keep if _n == 1

keep province year broad lntotalemploy_r_jt lntotalemploy_rjt totalemploy_r_jt totalemploy_rjt meanemploy_rjt lnmeanemploy_rjt lnmeanemploy_r_jt lntotalemploy_jt totalemploy_jt lnmeanemploy_jt meanemploy_jt totalemploy_rt

* Rebuild numeric group identifiers from year, province and broad.
egen t = group(year)
egen r = group(province)
egen j = group(broad)
egen rj = group(province broad)
egen jt = group(broad year)

save "${dir_data_analysis}/GSO_IV.dta", replace


** GENERATE AGGREGATE IV
use "${dir_data_analysis}/GSO_IV.dta", clear

* Province-by-broad averages, taken over years, of own-cell (rj) and leave-out (-rj) employment.
bysort province broad: egen avgtotalemploy_rj = mean(totalemploy_rjt)
bysort province broad: egen avgtotalemploy_r_j = mean(totalemploy_r_jt)
* Each year's value relative to its province-by-broad average.
generate a1 = totalemploy_r_jt / avgtotalemploy_r_j
generate b1 = totalemploy_rjt / avgtotalemploy_rj

* Weights: the rjt cell's share of rt employment, averaged within province-by-broad.
generate fraction = totalemploy_rjt / totalemploy_rt
bysort province broad: egen avgfrac = mean(fraction)

* Weighted sums across broad categories within each province-year.
generate a2 = a1 * avgfrac
bysort province year: egen IV_agg = total(a2)
generate b2 = b1 * avgfrac
bysort province year: egen agg = total(b2)

* One row per province-year.
bysort province year: keep if _n == 1

generate lnIV_agg = ln(IV_agg)
generate ln_agg = ln(agg)
keep province year IV_agg lnIV_agg agg ln_agg
sort province year

save "${dir_data_analysis}/GSO_IVagg.dta", replace
	
	
**GENERATE IV FOR other region-jt level
* Put the observation-level file in a tempfile so it can be merged onto the crosswalk.
use "${dir_data_analysis}/GSO_IVnoncollapsed.dta", clear
tempfile GSO
save `GSO', replace

* Province crosswalk as master (1:m requires province to be unique in it);
* only rows matched on both sides are retained.
insheet using "${dir_data_original}/Crosswalks/crosswalk_province.csv", comma clear
merge 1:m province using `GSO', keep(match) nogenerate
drop if pci_id==. | year==. | broad==""

* Attach the region file by pci_id, again retaining matched rows only.
merge m:1 pci_id using "${dir_data_original}/Crosswalks/region", keep(match) nogenerate

* NOTE(review): j and t are created in this do-file only for the collapsed data, after
* GSO_IVnoncollapsed.dta has been saved, so here they must already come from GSO_CLEAN.dta
* or one of the merged files. Confirm they identify the same cells as broad and year,
* which are the keys used for the de-duplication below.
egen regjt = group(region j t)

* Total of employ within each region-j-t cell, and its natural log.
bysort regjt: egen totalemploy_regjt = total(employ)

generate lntotalemploy_regjt = ln(totalemploy_regjt)

* Leave-out total: the jt total minus the own region's cell, and its natural log.
generate totalemploy_reg_jt = totalemploy_jt - totalemploy_regjt

generate lntotalemploy_reg_jt = ln(totalemploy_reg_jt)

* One row per region-broad-year.
bysort region broad year: keep if _n == 1

keep region broad year lntotalemploy_regjt lntotalemploy_reg_jt

sort region broad year

save "${dir_data_analysis}/GSO_IVregion.dta", replace

